import pandas as pd
import numpy as np
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns
sns.set (style="white")
sns.set (style="whitegrid", color_codes=True)
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from IPython.display import Image
import pydotplus as pydot
import graphviz
from IPython.display import display
from sklearn import tree
from os import system
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, recall_score, precision_score, f1_score, roc_auc_score,accuracy_score,roc_curve
from sklearn.metrics import classification_report
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import export_graphviz
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import AdaBoostClassifier
# Load the bank marketing dataset and take a first look at its structure.
Bank_DF = pd.read_csv('bank-full.csv')
Bank_DF.head(10)
Bank_DF.info()
Bank_DF.tail(10)
Bank_DF.info()
Bank_DF.isnull().sum()
Bank_DF.describe().transpose()
Bank_DF.describe()
Bank_DF.nunique()
# Cast the low-cardinality string columns to pandas 'category' dtype.
# Categoricals represent variables that take a limited, usually fixed,
# set of values, and storing them as categories saves memory compared
# to plain object/string columns.
categorical_columns = ['job', 'marital', 'education', 'default', 'poutcome',
                       'Target', 'housing', 'loan', 'contact', 'month']
for column in categorical_columns:
    Bank_DF[column] = Bank_DF[column].astype('category')
# Confirm the new category datatypes
Bank_DF.dtypes
#Deliverable – 1 (Exploratory data quality report reflecting the following) – (20)
#Univariate analysis (12 marks)
#Age: distribution, nulls, summary statistics and outlier check
plt.figure(figsize=(14,5))
sns.distplot(Bank_DF['age']);
# Check unique values
Bank_DF['age'].unique()
# Check null
Bank_DF['age'].isnull().sum()
# Check mean, min, max, std, quartiles
Bank_DF['age'].describe()
#Median value
Bank_DF['age'].median()
# Summary of the statistics printed above (values hard-coded from describe())
print('Mean for Age is 40.936210')
print('Median (Q2) for Age is 39')
print('Very small difference between mean and median')
print('Min value for Age is 18')
print('Max value for Age is 95')
print('Q1 for Age is 33')
print('Q2 for Age is 39')
print('Q3 for Age is 48')
# IQR outlier fences using the hard-coded quartiles above:
# lower = Q1 - 1.5*IQR, upper = Q3 + 1.5*IQR
outliers_lower=33-1.5*(48-33)
outliers_upper=48+1.5*(48-33)
print(outliers_lower)
print(outliers_upper)
# Number of outliers
# Lower
print('Number of outliers lower:',Bank_DF[Bank_DF['age']<outliers_lower]['age'].count())
# Upper
print('Number of outliers upper:',Bank_DF[Bank_DF['age']>outliers_upper]['age'].count())
plt.figure(figsize=(3,5))
sns.boxplot(x='age',data=Bank_DF, orient='v');
#Analysis for age
print('The minimum age is 18 and maximum age is 95 so the spread is across')
print('There is a small variance between mean and median values')
print('We do not see any null values')
print('The data for age is right skewed as can be seen from distplot and boxplot above')
print('There are outliers but they seem to be real world situation')
#Job: count plot, unique values, nulls, and value-count percentages
plt.figure(figsize=(20,10))
sns.countplot(Bank_DF['job'],data=Bank_DF)
plt.xlabel('Job', fontsize=15)
plt.ylabel('Count', fontsize=15)
plt.show();
#Check unique values
Bank_DF['job'].unique()
#Check null
Bank_DF['job'].isnull().any()
Bank_DF['job'].value_counts()
#Value counts percentage
Bank_DF['job'].value_counts(normalize=True)
#Analysis for job
print('For 288 customers the job information is unknown')
print('There are more customers in blue-collar (21.5%) and management (21%) jobs')
print('There are no null values')
#Education: same univariate checks as job
plt.figure(figsize=(5,5))
sns.countplot(Bank_DF['education'],data=Bank_DF)
plt.xlabel('Education', fontsize=15)
plt.ylabel('Count', fontsize=15)
plt.show();
#Check unique values
Bank_DF['education'].unique()
#Check null
Bank_DF['education'].isnull().any()
#Value counts
Bank_DF['education'].value_counts()
#Value counts percentage
Bank_DF['education'].value_counts(normalize=True)
#Analysis for education
print('There are maximum customers with secondary education followed by tertiary')
print('There are 1857 customers for which education details are not available')
print('There are no null values')
#Housing Loan: yes/no flag for a housing loan
plt.figure(figsize=(5,5))
sns.countplot(Bank_DF['housing'],data=Bank_DF)
plt.xlabel('Housing Loan', fontsize=15)
plt.ylabel('Count', fontsize=15)
plt.show();
#Check unique values
Bank_DF['housing'].unique()
#Check null
Bank_DF['housing'].isnull().any()
#Value counts
Bank_DF['housing'].value_counts()
#Value counts percentage
Bank_DF['housing'].value_counts(normalize=True)
#Analysis for housing loan
print('There are more customers who have housing loan')
print('There are no null values')
#Personal Loan: yes/no flag for a personal loan
plt.figure(figsize=(5,5))
sns.countplot(Bank_DF['loan'],data=Bank_DF)
plt.xlabel('Personal Loan', fontsize=15)
plt.ylabel('Count', fontsize=15)
plt.show();
#Check unique values
Bank_DF['loan'].unique()
#Check null
Bank_DF['loan'].isnull().any()
#Value counts
Bank_DF['loan'].value_counts()
#Value counts percentage
Bank_DF['loan'].value_counts(normalize=True)
#Analysis for personal loan
print('There are more customers (~84%) who do not have personal loan')
print('There are no null values')
#Credit in default: yes/no flag for a defaulted credit
plt.figure(figsize=(5,5))
sns.countplot(Bank_DF['default'],data=Bank_DF)
plt.xlabel('Credit in Default', fontsize=15)
plt.ylabel('Count', fontsize=15)
plt.show();
#Check unique values
Bank_DF['default'].unique()
#Check null
Bank_DF['default'].isnull().any()
#Value counts
Bank_DF['default'].value_counts()
#Value counts percentage
Bank_DF['default'].value_counts(normalize=True)
#Analysis for credit in default
print('There are only 815 customers with credit in default')
print('Approximately 98% customers do not have credit in default')
print('There are no null values')
#Marital status
plt.figure(figsize=(5,5))
sns.countplot(Bank_DF['marital'],data=Bank_DF)
plt.xlabel('Marital Status', fontsize=15)
plt.ylabel('Count', fontsize=15)
plt.show();
#Check unique values
Bank_DF['marital'].unique()
#Check null
Bank_DF['marital'].isnull().any()
#Value counts
Bank_DF['marital'].value_counts()
#Value counts percentage
Bank_DF['marital'].value_counts(normalize=True)
#Analysis for marital
print('There are more married customers followed by singles')
print('There are no null values')
#Balance in account: continuous variable, so distribution + outlier check
plt.figure(figsize=(14,5))
sns.distplot(Bank_DF['balance']);
#Check null
Bank_DF['balance'].isnull().any()
#Check mean, min, max, std, quartiles
Bank_DF['balance'].describe()
#Median value
Bank_DF['balance'].median()
# Summary of the statistics above (values hard-coded from describe())
print('Min value for balance is -8019')
print('Max value for balance is 102127')
print('Mean for balance is 1362.272058')
print('Median or Q2 for balance is 448.9')
print('Q1 for balance is 72')
print('Q2 for balance is 448')
print('Q3 for balance is 1428')
# Q1 and Q3 values from above; IQR fences: Q1-1.5*IQR and Q3+1.5*IQR
outliers_lower=72-1.5*(1428-72)
outliers_upper=1428+1.5*(1428-72)
print(outliers_lower)
print(outliers_upper)
# Number of outliers
#Lower
print('Number of outliers lower:',Bank_DF[Bank_DF['balance']<outliers_lower]['balance'].count())
#Upper
print('Number of outliers upper:',Bank_DF[Bank_DF['balance']>outliers_upper]['balance'].count())
plt.figure(figsize=(8,8))
sns.boxplot(x='balance',data=Bank_DF, orient='v');
#Analysis for Balance
print('The minimum balance is -8019 and maximum balance is 102127')
print('There is high variance between mean and median values')
print('We do not see any null values')
print('The data for balance is highly right skewed as can be seen from distplot and boxplot above')
print('We can see the upper boundary for outliers is 3462 and there are 4712 outliers in upper boundary')
#Contact type: communication channel used for the last contact
plt.figure(figsize=(5,5))
sns.countplot(Bank_DF['contact'],data=Bank_DF)
plt.xlabel('Contact Type', fontsize=15)
plt.ylabel('Count', fontsize=15)
plt.show();
#Check unique values
Bank_DF['contact'].unique()
#Check null
Bank_DF['contact'].isnull().any()
#Value counts
Bank_DF['contact'].value_counts()
#Value counts percentage
Bank_DF['contact'].value_counts(normalize=True)
#Analysis for contact type
print('There are more customers (~65%) who were contacted via mobile')
print('However for 13020 customers the contact type is unknown')
print('There are no null values')
#Last contact: day of the month
# Check unique values
Bank_DF['day'].unique()
# Check null
Bank_DF['day'].isnull().any()
# value counts
Bank_DF['day'].value_counts()
# value counts percentage
Bank_DF['day'].value_counts(normalize=True)
#Last contact: month
#Check unique values
Bank_DF['month'].unique()
#Check null
Bank_DF['month'].isnull().any()
#Value counts
Bank_DF['month'].value_counts()
#Value counts percentage
Bank_DF['month'].value_counts(normalize=True)
#Analysis for contact month
print('Max contacts were made in the month of May and max contacts ranges from May to August')
print('There are no null values')
#Campaign: number of contacts performed during this campaign per client
plt.figure(figsize=(14,5))
sns.distplot(Bank_DF['campaign']);
#Check null
Bank_DF['campaign'].isnull().any()
#Check mean, min, max, std, quartiles
Bank_DF['campaign'].describe()
#Median value
Bank_DF['campaign'].median()
# Summary of the statistics above (values hard-coded from describe())
print('Mean for campaign is 2.763841')
print('Median or Q2 for campaign is 2')
print('Min value for campaign is 1')
print('Max value for campaign is 63')
print('Q1 for campaign is 1')
print('Q2 for campaign is 2')
print('Q3 for campaign is 3')
print('In the plot we saw the data is right skewed')
# Q1 and Q3 values from above; IQR fences: Q1-1.5*IQR and Q3+1.5*IQR
outliers_lower=1-1.5*(3-1)
outliers_upper=3+1.5*(3-1)
print(outliers_lower)
print(outliers_upper)
# Number of outliers
# BUG FIX: the original counted outliers in the 'balance' column here — a
# copy-paste slip from the balance section; this section analyses 'campaign'.
#Lower
print('Number of outliers lower:',Bank_DF[Bank_DF['campaign']<outliers_lower]['campaign'].count())
#Upper
print('Number of outliers upper:',Bank_DF[Bank_DF['campaign']>outliers_upper]['campaign'].count())
#Analysis Campaign
print('There is not much difference between mean and median values')
print('Minimum and Maximum for campaign are 1 and 63')
print('Data is highly right skewed')
#pdays: Days since last contact from a previous campaign (-1 = never contacted)
#Check null
Bank_DF['pdays'].isnull().any()
#Check mean, min, max, std, quartiles
Bank_DF['pdays'].describe()
#Median value
Bank_DF['pdays'].median()
# Count the sentinel value -1, which encodes "never contacted before"
print('pdays is -1: {}'.format(Bank_DF[Bank_DF.pdays==-1].shape[0]))
print ('Most of records for pdays value are -1 which means that customer has never been contacted')
#previous: Number of contacts performed before this campaign
#Check null
Bank_DF['previous'].isnull().any()
#Check mean, min, max, std, quartiles
Bank_DF['previous'].describe()
#Median value
Bank_DF['previous'].median()
#Previous Campaign outcome (poutcome)
plt.figure(figsize=(5,5))
sns.countplot(Bank_DF['poutcome'],data=Bank_DF)
plt.xlabel('Previous Campaign Outcome', fontsize=15)
plt.ylabel('Count', fontsize=15)
plt.show();
#Check unique values
Bank_DF['poutcome'].unique()
#Check null
Bank_DF['poutcome'].isnull().any()
#Value counts
Bank_DF['poutcome'].value_counts()
#Value counts percentage
Bank_DF['poutcome'].value_counts(normalize=True)
print ('The success rate from previous campaign was around 3% and for a majority of customers it is unknown (81.7%)')
#Target: did the client subscribe to a term deposit (yes/no)
plt.figure(figsize=(5,5))
sns.countplot(Bank_DF['Target'],data=Bank_DF)
plt.xlabel('Did Client Subscribed Term Deposit', fontsize=15)
plt.ylabel('Count', fontsize=15)
plt.show();
#Check unique values
Bank_DF['Target'].unique()
#Check null
Bank_DF['Target'].isnull().any()
#Value counts
Bank_DF['Target'].value_counts()
#Value counts percentage shows the class imbalance
Bank_DF['Target'].value_counts(normalize=True)
print('Around 12% customers subscribed to term deposits')
#Data challenges
#We have seen unknown/missing data for job and education which can impact our target variables
# Let's check job
Bank_DF['job'].value_counts()
#We see for 288 customers we do not have their job information
#The number of customers who are above 60 and have an unknown job.
# FIX: the original used chained boolean indexing
# (Bank_DF['job'][mask1][mask2]), where the second full-length mask is
# applied to an already-filtered Series; modern pandas rejects unalignable
# boolean masks. Combining both conditions in a single .loc is correct
# and warning-free.
Bank_DF.loc[(Bank_DF['age']>60) & (Bank_DF['job']=='unknown'), 'job'].value_counts()
#22 customers have no job information and are above 60 Therefore we can move them to retired category
Bank_DF.loc[(Bank_DF['age']>60) & (Bank_DF['job']=='unknown'), 'job'] = 'retired'
#Comparing job with education to check if there is a correlation
pd.crosstab(Bank_DF['job'], Bank_DF['education'])
#In case a person has primary education only then mostly they have blue-collar jobs
#We can also see a person with tertiary education mostly have management jobs
#We are unable to determine understanding for people with secondary education as the spread is across technician, blue-collar and admin jobs
#Moving customers with primary education and unknown job to blue-collar job and customers with tertiary education and unknown job to management job
Bank_DF.loc[(Bank_DF['education']=='primary') & (Bank_DF['job']=='unknown'), 'job'] = 'blue-collar'
Bank_DF.loc[(Bank_DF['education']=='tertiary') & (Bank_DF['job']=='unknown'), 'job'] = 'management'
Bank_DF
#We can conclude below understanding
#People in admin job mostly have secondary education
#People in management job mostly have tertiary education
#People in services job mostly have secondary education
#People in technician job mostly have secondary education
#People in housemaid job mostly have primary education
Bank_DF.loc[(Bank_DF['job']=='admin.') & (Bank_DF['education']=='unknown'), 'education'] = 'secondary'
Bank_DF.loc[(Bank_DF['job']=='management') & (Bank_DF['education']=='unknown'), 'education'] = 'tertiary'
Bank_DF.loc[(Bank_DF['job']=='services') & (Bank_DF['education']=='unknown'), 'education'] = 'secondary'
Bank_DF.loc[(Bank_DF['job']=='technician') & (Bank_DF['education']=='unknown'), 'education'] = 'secondary'
Bank_DF.loc[(Bank_DF['job']=='housemaid') & (Bank_DF['education']=='unknown'), 'education'] ='primary'
Bank_DF
# We can combine admin into management, housemaid into blue-collar and self employed with entrepreneur
Job_mapping = {'admin.':'management', 'housemaid':'blue-collar', 'self-employed':'entrepreneur'}
Bank_DF['job'] = Bank_DF['job'].replace(Job_mapping)
Bank_DF['job'].value_counts()
Bank_DF['job'].value_counts(normalize=True)
Bank_DF['job']=Bank_DF['job'].astype('category')
Bank_DF
#pdays had -1 for a many records, meaning the customer has never been contacted
pd.crosstab(Bank_DF['pdays'],Bank_DF['poutcome'])
Bank_DF
#Outliers
print('The minimum balance is -8019 and maximum balance is 102127')
print('There is much variance between mean and median values and we do not see any null values')
print('The data for balance is highly right skewed')
print('Number of outliers upper: 4712')
print('Upper boundary for outliers: 3462')
plt.figure(figsize=(14,5))
sns.distplot(Bank_DF['balance']);
print ('Negative balances in the dataset, we can update them as zero balance as customers with higher balance only will choose for term deposits. Converting negative balances to zero.')
Bank_DF.loc[Bank_DF.balance<0,'balance'] = 0
Bank_DF.loc[Bank_DF.balance>3462,'balance'] = 3462
#Multivariate analysis
# NOTE(review): distplot and the 'bw' diag keyword are deprecated/removed in
# newer seaborn releases — confirm the installed seaborn version supports them
plt.figure(figsize=(20,5))
sns.pairplot(Bank_DF, diag_kind='kde', diag_kws={'bw':'1.0'})
plt.show();
# Correlation heatmap of the numeric columns
# NOTE(review): DataFrame.corr() on a frame with category columns relies on
# older pandas auto-selecting numeric columns; newer pandas needs
# numeric_only=True — verify against the pandas version in use
plt.figure(figsize=(10,8))
sns.heatmap(Bank_DF.corr(),
annot=True,
linewidths=.5,
center=0,
cbar=False)
plt.show();
# Target and Balance
sns.barplot(x='Target', y='balance', data=Bank_DF)
plt.show();
print('customers with higher balance have more term deposits')
#Target and duration
sns.barplot(x='Target', y='duration', data=Bank_DF)
plt.show();
#If call is longer more customers have subscribed
#Dropping duration as it's value will be known after call and doesn't seem impact target variable
Bank_DF=Bank_DF.drop(['duration'],axis=1)
#Target and job: subscription rate per job category
pd.crosstab(Bank_DF['job'], Bank_DF['Target'], normalize='index')
plt.figure(figsize=(20,5))
sns.countplot(x='job',hue='Target', data=Bank_DF);
print('The highest percentages for customers accepting term deposit are:')
print('#Student subscribed to term deposit = 28.67%')
print('#Retired subscribed to term deposit = 22.79%')
print('#Unempld subscribed to term deposit = 15.50%')
print('#People with job management have more term deposits followed by technician and blue-collar')
#Target and marital
pd.crosstab(Bank_DF['marital'], Bank_DF['Target'], normalize='index')
sns.countplot(x='marital',hue='Target', data=Bank_DF);
print('The highest percentages for customers accepting term deposit are:')
print('Single subscribed to term deposit = 14.94%')
print('Divorced subscribed to term deposit = 11.94%')
print('Overall married customers have more term deposits followed by single customers')
#Target and Education
pd.crosstab(Bank_DF['education'], Bank_DF['Target'], normalize='index')
sns.countplot(x='education',hue='Target', data=Bank_DF);
print('The highest percentages for customers accepting term deposit are:')
print('Customers with Tertiary(higher) education subscribed to term deposit = 15%')
print('Next highest percentage is for customers for whom we do not know their education = 13.57%')
print('Overall customers with secondary education have more term deposits followed by tertiary education')
#Target and default
pd.crosstab(Bank_DF['default'], Bank_DF['Target'], normalize='index')
sns.countplot(x='default',hue='Target', data=Bank_DF);
print('The customers who do not default on credit have a higher percentage of accepting term deposit')
#Target and Housing
pd.crosstab(Bank_DF['housing'], Bank_DF['Target'], normalize='index')
sns.countplot(x='housing',hue='Target', data=Bank_DF);
print('The customers who do not have home loan had higher %age accepting term deposits')
#Target and Personal Loan
pd.crosstab(Bank_DF['loan'], Bank_DF['Target'], normalize='index')
sns.countplot(x='loan',hue='Target', data=Bank_DF);
print('The customers who do not have personal loan had higher ppercentage accepting term deposit')
#Target and Contact
pd.crosstab(Bank_DF['contact'], Bank_DF['Target'])
sns.countplot(x='contact',hue='Target', data=Bank_DF);
print('There are 13020 records where contact is Unkown.')
print('From that count 530 have Target variable as yes.')
print('The majority where the Target variable is yes when the contact type was cellular.')
#We can move unknown contacts where Target is yes to contact as cellular
# NOTE(review): this imputes a feature using the Target value, which leaks
# label information into the predictors — confirm this is intentional
Bank_DF.loc[(Bank_DF['contact']=='unknown') & (Bank_DF['Target']=='yes'), 'contact'] = 'cellular'
Bank_DF
#Encode every categorical column as integer codes so sklearn models accept them
labelEncod = preprocessing.LabelEncoder()
for column in ['job', 'marital', 'education', 'default', 'housing',
               'loan', 'contact', 'month', 'poutcome', 'Target']:
    Bank_DF[column] = labelEncod.fit_transform(Bank_DF[column])
# Sanity check: all columns are now numeric
Bank_DF.describe().T
#Deliverable – 2 (Prepare the data for analytics) – (10)
#Handle target variable imbalance by naive oversampling of the positive class
Bank_DF['Target'].value_counts(normalize=True)
Bank_DF_copy = Bank_DF.copy()
Bank_DF_copy_2 = Bank_DF_copy[Bank_DF.Target==1]
# Append the subscribed (Target==1) rows seven more times
for _ in range(7):
    Bank_DF_copy = pd.concat([Bank_DF_copy, Bank_DF_copy_2])
Bank_DF = Bank_DF_copy
Bank_DF
# Class ratio after oversampling
Bank_DF['Target'].value_counts(normalize=True)
#Get the data model ready.
#Independent variables
x = Bank_DF.drop('Target', axis=1)
#Dependent variable
y = Bank_DF['Target']
# Feature names for the tree visualisations later on
features = [col for col in Bank_DF.columns if col != 'Target']
#Create the training set and test set in ratio of 70:30
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.30, random_state=1)
#Normalize/Scale: fit on the training set only, then transform both sets
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)
# (no-op expression retained from the original notebook cell output)
StandardScaler(copy=True, with_mean=True, with_std=True)
#Deliverable – 3 (create the ensemble model) – (30)
#First create models using Logistic Regression and Decision Tree algorithm. Note the model performance by using
#different matrices. Use confusion matrix to evaluate
#class level metrics i.e. Precision/Recall. Also reflect the accuracy and F1 score of the model. (10 marks)
#Logistic Regression baseline model
logreg = LogisticRegression(random_state=1)
logreg.fit(x_train, y_train)
#Train data accuracy
print("Train: %.2f" % logreg.score(x_train, y_train))
#Test data accuracy
print("Test: %.2f" % logreg.score(x_test, y_test))
#predict on the held-out test set
y_predict = logreg.predict(x_test)
#Confusion matrix as a labelled heatmap
cm=metrics.confusion_matrix(y_test, y_predict, labels=[0, 1])
df_cm = pd.DataFrame(cm, index = [i for i in ["No","Yes"]],
columns = [i for i in ["No","Yes"]])
plt.figure(figsize = (7,5))
sns.heatmap(df_cm, annot=True ,fmt='g')
# Check different metrics: recall, precision, F1, ROC AUC, accuracy
print('Confusion Matrix')
print(confusion_matrix(y_test,y_predict))
var_recall=recall_score(y_test,y_predict)
print("Recall:",var_recall)
var_precision=precision_score(y_test,y_predict)
print("Precision:",var_precision)
var_f1=f1_score(y_test,y_predict)
print("F1 Score:",var_f1)
var_roc=roc_auc_score(y_test,y_predict)
print("Roc Auc Score:",var_roc)
var_accuracy=accuracy_score(y_test,y_predict)
print("Accuracy Score:",var_accuracy)
print(classification_report(y_test, y_predict))
#Result dataframe for final comparison across all models
resultsDf = pd.DataFrame({'Method':['Logistic Regression'], 'Accuracy': var_accuracy,
'Recall': var_recall,'Precision': var_precision,'F1 Score': var_f1,'ROC AUC Score': var_roc})
resultsDf = resultsDf[['Method', 'Accuracy','Recall','Precision','F1 Score','ROC AUC Score']]
resultsDf
#Decision Tree classifier with entropy split criterion
dt_Tree = DecisionTreeClassifier(criterion = 'entropy',random_state=1 )
dt_Tree.fit(x_train, y_train)
#Train data accuracy
print("Train: %.2f" % dt_Tree.score(x_train, y_train))
#Test data accuracy
print("Test: %.2f" % dt_Tree.score(x_test, y_test))
print('High degree of overfitting spotted')
# FIX: sklearn.externals.six was deprecated in scikit-learn 0.21 and removed
# in 0.23, so the original import fails on current versions; the standard
# library's io.StringIO is a drop-in replacement for this use.
from io import StringIO
# Render the fitted tree to a PNG via graphviz dot text
dot_data = StringIO()
export_graphviz(dt_Tree, out_file=dot_data,filled=True, rounded=True, special_characters=True,feature_names = features,class_names=['0','1'])
graph = pydot.graph_from_dot_data(dot_data.getvalue())
graph.write_png('dt_Tree_Sales.png')
Image(graph.create_png())
y_predict = dt_Tree.predict(x_test)
#Confusion matrix as a labelled heatmap
cm_DT=metrics.confusion_matrix(y_test, y_predict, labels=[0, 1])
df_cm_DT = pd.DataFrame(cm_DT, index = [i for i in ["No","Yes"]],
columns = [i for i in ["No","Yes"]])
plt.figure(figsize = (7,5))
sns.heatmap(df_cm_DT, annot=True ,fmt='g')
# Check different metrics: recall, precision, F1, ROC AUC, accuracy
print('Confusion Matrix')
print(confusion_matrix(y_test,y_predict))
var_recall=recall_score(y_test,y_predict)
print("Recall:",var_recall)
var_precision=precision_score(y_test,y_predict)
print("Precision:",var_precision)
var_f1=f1_score(y_test,y_predict)
print("F1 Score:",var_f1)
var_roc=roc_auc_score(y_test,y_predict)
print("Roc Auc Score:",var_roc)
var_accuracy=accuracy_score(y_test,y_predict)
print("Accuracy Score:",var_accuracy)
print(classification_report(y_test, y_predict))
# Append this model's metrics to the comparison dataframe
tempResultsDf = pd.DataFrame({'Method':['Decision Tree'], 'Accuracy': var_accuracy, 'Recall': var_recall,'Precision': var_precision,'F1 Score': var_f1,'ROC AUC Score': var_roc})
resultsDf = pd.concat([resultsDf, tempResultsDf])
resultsDf = resultsDf[['Method', 'Accuracy','Recall','Precision','F1 Score','ROC AUC Score']]
resultsDf
#Now to prune the Decision Tree by limiting its depth to curb overfitting
dt_Tree_pruned = DecisionTreeClassifier(criterion = "entropy", max_depth=6)
dt_Tree_pruned.fit(x_train, y_train)
#Train data accuracy
print("Train: %.2f" % dt_Tree_pruned.score(x_train, y_train))
#Test data accuracy
print("Test: %.2f" % dt_Tree_pruned.score(x_test, y_test))
# Render the pruned tree to a PNG (StringIO imported in the unpruned-tree cell)
dot_data = StringIO()
export_graphviz(dt_Tree_pruned, out_file=dot_data,filled=True, rounded=True,
special_characters=True,feature_names = features,class_names=['0','1'])
graph = pydot.graph_from_dot_data(dot_data.getvalue())
graph.write_png('dt_Tree_Sales_Pruned.png')
Image(graph.create_png())
y_predict = dt_Tree_pruned.predict(x_test)
#Confusion matrix as a labelled heatmap
Prune_cm_DT=metrics.confusion_matrix(y_test, y_predict, labels=[0, 1])
Prune_df_cm_DT = pd.DataFrame(Prune_cm_DT, index = [i for i in ["No","Yes"]],
columns = [i for i in ["No","Yes"]])
plt.figure(figsize = (7,5))
sns.heatmap(Prune_df_cm_DT, annot=True ,fmt='g')
#Check different metrics: recall, precision, F1, ROC AUC, accuracy
print('Confusion Matrix')
print(confusion_matrix(y_test,y_predict))
var_recall=recall_score(y_test,y_predict)
print("Recall:",var_recall)
var_precision=precision_score(y_test,y_predict)
print("Precision:",var_precision)
var_f1=f1_score(y_test,y_predict)
print("F1 Score:",var_f1)
var_roc=roc_auc_score(y_test,y_predict)
print("Roc Auc Score:",var_roc)
var_accuracy=accuracy_score(y_test,y_predict)
print("Accuracy Score:",var_accuracy)
print(classification_report(y_test, y_predict))
#Feature importance of the pruned tree, sorted descending
feat_importance = dt_Tree_pruned.tree_.compute_feature_importances(normalize=False)
feat_imp_dict = dict(zip(features, dt_Tree_pruned.feature_importances_))
feat_imp = pd.DataFrame.from_dict(feat_imp_dict, orient='index')
feat_imp.sort_values(by=0, ascending=False)
# Append this model's metrics to the comparison dataframe
tempResultsDf = pd.DataFrame({'Method':['Pruned Decision Tree'], 'Accuracy': var_accuracy, 'Recall': var_recall,'Precision': var_precision,'F1 Score': var_f1,'ROC AUC Score': var_roc})
resultsDf = pd.concat([resultsDf, tempResultsDf])
resultsDf = resultsDf[['Method', 'Accuracy','Recall','Precision','F1 Score','ROC AUC Score']]
resultsDf
#Random Forest with 50 trees
rfcl = RandomForestClassifier(n_estimators = 50)
rfcl = rfcl.fit(x_train, y_train)
#Train data accuracy
print("Train: %.2f" % rfcl.score(x_train, y_train))
#Test data accuracy
print("Test: %.2f" % rfcl.score(x_test, y_test))
y_predict = rfcl.predict(x_test)
#Confusion matrix as a labelled heatmap
RandomForest_cm_DT=metrics.confusion_matrix(y_test, y_predict, labels=[0, 1])
RandomForest_df_cm_DT = pd.DataFrame(RandomForest_cm_DT, index = [i for i in ["No","Yes"]],
columns = [i for i in ["No","Yes"]])
plt.figure(figsize = (7,5))
sns.heatmap(RandomForest_df_cm_DT, annot=True ,fmt='g')
# Check different metrics: recall, precision, F1, ROC AUC, accuracy
print('Confusion Matrix')
print(confusion_matrix(y_test,y_predict))
var_recall=recall_score(y_test,y_predict)
print("Recall:",var_recall)
var_precision=precision_score(y_test,y_predict)
print("Precision:",var_precision)
var_f1=f1_score(y_test,y_predict)
print("F1 Score:",var_f1)
var_roc=roc_auc_score(y_test,y_predict)
print("Roc Auc Score:",var_roc)
var_accuracy=accuracy_score(y_test,y_predict)
print("Accuracy Score:",var_accuracy)
print(classification_report(y_test, y_predict))
# Hyperparameter search space: number of trees
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
# NOTE(review): max_features='auto' was removed in scikit-learn 1.1 — confirm
# the installed version still accepts it
max_features = ['auto', 'sqrt']
#Maximum levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
#Minimum samples required to split a node
min_samples_split = [2, 5, 10]
#Minimum number required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples
bootstrap = [True, False]
random_grid = {'n_estimators': n_estimators,
'max_features': max_features,
'max_depth': max_depth,
'min_samples_split': min_samples_split,
'min_samples_leaf': min_samples_leaf,
'bootstrap': bootstrap}
print(random_grid)
#To get best model
#Set n_iter to 100, Number of parameter settings that are sampled. n_iter trades off runtime vs quality of the solution.
rf = RandomForestClassifier()
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model (refits the best estimator on the full training set)
rf_random.fit(x_train, y_train)
#Checking best parameters
rf_random.best_params_
y_predict = rf_random.predict(x_test)
# Confusion matrix as a labelled heatmap
Randomized_cm_DT=metrics.confusion_matrix(y_test, y_predict, labels=[0, 1])
Randomized_df_cm_DT = pd.DataFrame(Randomized_cm_DT, index = [i for i in ["No","Yes"]],columns = [i for i in ["No","Yes"]])
plt.figure(figsize = (7,5))
sns.heatmap(Randomized_df_cm_DT, annot=True ,fmt='g')
#Check different metrics: recall, precision, F1, ROC AUC, accuracy
print('Confusion Matrix')
print(confusion_matrix(y_test,y_predict))
var_recall=recall_score(y_test,y_predict)
print("Recall:",var_recall)
var_precision=precision_score(y_test,y_predict)
print("Precision:",var_precision)
var_f1=f1_score(y_test,y_predict)
print("F1 Score:",var_f1)
var_roc=roc_auc_score(y_test,y_predict)
print("Roc Auc Score:",var_roc)
var_accuracy=accuracy_score(y_test,y_predict)
print("Accuracy Score:",var_accuracy)
print(classification_report(y_test, y_predict))
#Insert the tuned model's metrics into the result data frame
tempResultsDf = pd.DataFrame({'Method':['Random Forest'], 'Accuracy': var_accuracy, 'Recall': var_recall,'Precision': var_precision,'F1 Score': var_f1,'ROC AUC Score': var_roc})
resultsDf = pd.concat([resultsDf, tempResultsDf])
resultsDf = resultsDf[['Method', 'Accuracy','Recall','Precision','F1 Score','ROC AUC Score']]
resultsDf
#Bagging ensemble: 50 estimators, each trained on a 50% bootstrap sample
bgcl = BaggingClassifier(n_estimators=50, max_samples= .5, bootstrap=True, oob_score=True, random_state=22)
bgcl = bgcl.fit(x_train, y_train)
print("Train: %.2f" % bgcl.score(x_train, y_train))
print("Test: %.2f" % bgcl.score(x_test, y_test))
y_predict = bgcl.predict(x_test)
# Confusion matrix as a labelled heatmap
Bagging_cm_DT=metrics.confusion_matrix(y_test, y_predict, labels=[0, 1])
Bagging_df_cm_DT = pd.DataFrame(Bagging_cm_DT, index = [i for i in ["No","Yes"]],columns = [i for i in ["No","Yes"]])
plt.figure(figsize = (7,5))
sns.heatmap(Bagging_df_cm_DT, annot=True ,fmt='g')
# Check different metrics: recall, precision, F1, ROC AUC, accuracy
print('Confusion Matrix')
print(confusion_matrix(y_test,y_predict))
var_recall=recall_score(y_test,y_predict)
print("Recall:",var_recall)
var_precision=precision_score(y_test,y_predict)
print("Precision:",var_precision)
var_f1=f1_score(y_test,y_predict)
print("F1 Score:",var_f1)
var_roc=roc_auc_score(y_test,y_predict)
print("Roc Auc Score:",var_roc)
var_accuracy=accuracy_score(y_test,y_predict)
print("Accuracy Score:",var_accuracy)
print(classification_report(y_test, y_predict))
# Hyperparameter search space for the bagging model
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
bootstrap = [True, False]
random_grid = {'n_estimators': n_estimators,
'bootstrap': bootstrap}
print(random_grid)
#To get best model
#Set n_iter to 100, Number of parameter settings that are sampled. n_iter trades off runtime vs quality of the solution.
bgcl = BaggingClassifier()
bgcl_random = RandomizedSearchCV(estimator = bgcl, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
bgcl_random.fit(x_train, y_train)
#Checking best parameters
bgcl_random.best_params_
#Predict with the tuned model
y_predict = bgcl_random.predict(x_test)
# Check different metrics: recall, precision, F1, ROC AUC, accuracy
print('Confusion Matrix')
print(confusion_matrix(y_test,y_predict))
var_recall=recall_score(y_test,y_predict)
print("Recall:",var_recall)
var_precision=precision_score(y_test,y_predict)
print("Precision:",var_precision)
var_f1=f1_score(y_test,y_predict)
print("F1 Score:",var_f1)
var_roc=roc_auc_score(y_test,y_predict)
print("Roc Auc Score:",var_roc)
var_accuracy=accuracy_score(y_test,y_predict)
print("Accuracy Score:",var_accuracy)
print(classification_report(y_test, y_predict))
#Update the comparison data frame with the bagging metrics
tempResultsDf = pd.DataFrame({'Method':['Bagging'], 'Accuracy': var_accuracy,'Recall': var_recall,'Precision': var_precision,'F1 Score': var_f1,'ROC AUC Score': var_roc})
resultsDf = pd.concat([resultsDf, tempResultsDf])
resultsDf = resultsDf[['Method', 'Accuracy','Recall','Precision','F1 Score','ROC AUC Score']]
resultsDf
#Boosting: AdaBoost with 100 weak learners and a small learning rate
abcl = AdaBoostClassifier(n_estimators = 100, learning_rate=0.1, random_state=22)
abcl = abcl.fit(x_train, y_train)
print("Train: %.2f" % abcl.score(x_train, y_train))
print("Test: %.2f" % abcl.score(x_test, y_test))
y_predict = abcl.predict(x_test)
# Confusion matrix as a labelled heatmap
Boosting_cm_DT=metrics.confusion_matrix(y_test, y_predict, labels=[0, 1])
Boosting_df_cm_DT = pd.DataFrame(Boosting_cm_DT, index = [i for i in ["No","Yes"]],columns = [i for i in ["No","Yes"]])
plt.figure(figsize = (7,5))
sns.heatmap(Boosting_df_cm_DT, annot=True ,fmt='g')
#Check different metrics: recall, precision, F1, ROC AUC, accuracy
print('Confusion Matrix')
print(confusion_matrix(y_test,y_predict))
var_recall=recall_score(y_test,y_predict)
print("Recall:",var_recall)
var_precision=precision_score(y_test,y_predict)
print("Precision:",var_precision)
var_f1=f1_score(y_test,y_predict)
print("F1 Score:",var_f1)
var_roc=roc_auc_score(y_test,y_predict)
print("Roc Auc Score:",var_roc)
var_accuracy=accuracy_score(y_test,y_predict)
print("Accuracy Score:",var_accuracy)
print(classification_report(y_test, y_predict))
# Hyperparameter search space for AdaBoost: number of estimators
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
random_grid = {'n_estimators': n_estimators}
print(random_grid)
#To get best model
#Set n_iter to 100, Number of parameter settings that are sampled. n_iter trades off runtime vs quality of the solution.
# BUG FIX: the original instantiated BaggingClassifier() here, so this
# "AdaBoost" randomized search actually tuned a bagging model and the
# reported Adaboost row compared the wrong estimator; use AdaBoostClassifier.
abcl = AdaBoostClassifier()
abcl_random = RandomizedSearchCV(estimator = abcl, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
abcl_random.fit(x_train, y_train)
#Predict with the tuned model
y_predict = abcl_random.predict(x_test)
# Check different metrics: recall, precision, F1, ROC AUC, accuracy
print('Confusion Matrix')
print(confusion_matrix(y_test,y_predict))
var_recall=recall_score(y_test,y_predict)
print("Recall:",var_recall)
var_precision=precision_score(y_test,y_predict)
print("Precision:",var_precision)
var_f1=f1_score(y_test,y_predict)
print("F1 Score:",var_f1)
var_roc=roc_auc_score(y_test,y_predict)
print("Roc Auc Score:",var_roc)
var_accuracy=accuracy_score(y_test,y_predict)
print("Accuracy Score:",var_accuracy)
print(classification_report(y_test, y_predict))
#Update the comparison data frame with the AdaBoost metrics
tempResultsDf = pd.DataFrame({'Method':['Adaboost'], 'Accuracy': var_accuracy,
'Recall': var_recall,'Precision': var_precision,'F1 Score': var_f1,'ROC AUC Score': var_roc})
resultsDf = pd.concat([resultsDf, tempResultsDf])
resultsDf = resultsDf[['Method', 'Accuracy','Recall','Precision','F1 Score','ROC AUC Score']]
resultsDf
#From the above dataframe we can see that:
#After performing CPU intensive RandomizedSearchCV with 100 sampled parameters
#Random Forest gives the best results with an accuracy of 98.22% with a good recall, precision, F1 score and ROC AUC
#The second best model is Bagging with accuracy of 96.58% and Adaboost with 96.57%
# NOTE(review): the minority class was oversampled *before* the train/test
# split, so duplicated rows can appear in both sets and these scores are
# likely optimistic — confirm by re-running with the split done first.